In [73]:
import csv
import pandas as pd

# Clean data
#
# From https://en.wikipedia.org/wiki/List_of_public_corporations_by_market_capitalization
# Data Source : https://docs.google.com/spreadsheets/d/1NxEouZb0AHGZ3OcwUPp8Ga1LmBjp37NABtwdcYSE91I/edit#gid=115560802

# Visualization
# http://blockbuilder.org/clemsos/4923d7601f5e4b208b740123679047f7
In [74]:
# parse fist CSV
first = "raw/Data - List of public corporations by market capitalization - 2007-2006.csv"

df = pd.read_csv(first)
df.dropna(how="all", inplace=True) # drop empty lines

_id = df["Year"].astype(int).map(str) + "-" + df["Q"]+"-"+df["Rank"].astype(int).map(str)
q = df["Name"].str.split('  ', expand=True)
final_first = pd.concat([_id, q, df], axis=1, join='inner')
final_first = final_first.drop(['Name'], axis=1)

# rename columns
final_first.columns.values[0] = 'Id'
final_first.columns.values[1] = 'Company'
final_first.columns.values[2] = 'Company Link'
final_first = final_first.rename(index=str, columns={"Q": "Quarter"})
final_first["Headquarters Link"] = None

final_first.head()

print final_first.shape
(80, 10)
In [75]:
# parse properly second CSV
second = "raw/Data - List of public corporations by market capitalization - 2008-2017.csv"
df = pd.read_csv(second)

df.dropna(how="all", inplace=True) # drop empty lines

print df.shape

col_names = [
    ('Q1', 'Q1 (Country)'),
    ('Q2', 'Q2 (Country)'),
    ('Q3', 'Q3 (Country'),
    ('Q4', 'Q4 (Country)')
]

final_second = pd.DataFrame()

for cols in col_names:
    
    _id = df["Year"].astype(int).map(str) + "-" + cols[0]+"-"+df["Rank"].astype(int).map(str)
    
    q = df[cols[0]].str.split('  ', expand=True)
    q.columns = ["Company", "Company Link", "Market value", "Notes"]
    
    result = pd.concat([_id, q], axis=1, join='inner')
    
    result.columns.values[0] = 'Id'
    result["Year"] = df["Year"].astype(int)
    result["Quarter"]= cols[0] # add quarter
    result["Rank"] = df["Rank"].astype(int)
    
    result["Headquarters"] = df[cols[1]].str[1:-1].str.split('/', expand=True)[4]
    result["Headquarters Link"] = df[cols[1]]
    
    result["Primary industry"] = None
    
    print result.shape

    final_second = final_second.append(result)

print final_second.shape
final_second.head()
(100, 10)
(100, 11)
(100, 11)
(100, 11)
(100, 11)
(400, 11)
Out[75]:
Id Company Company Link Market value Notes Year Quarter Rank Headquarters Headquarters Link Primary industry
0 2017-Q1-3 Microsoft (https://en.wikipedia.org/wiki/Microsoft) 508,935 None 2017 Q1 3 United_States (https://en.wikipedia.org/wiki/United_States) None
1 2017-Q1-1 Apple Inc. (https://en.wikipedia.org/wiki/Apple_Inc.) 753,718 None 2017 Q1 1 United_States (https://en.wikipedia.org/wiki/United_States) None
2 2017-Q1-4 Amazon.com (https://en.wikipedia.org/wiki/Amazon.com) 423,031 None 2017 Q1 4 United_States (https://en.wikipedia.org/wiki/United_States) None
3 2017-Q1-2 Alphabet Inc. (https://en.wikipedia.org/wiki/Alphabet_Inc.) 573,570 [14] (#cite_note-ya-15) 2017 Q1 2 United_States (https://en.wikipedia.org/wiki/United_States) None
4 2017-Q1-5 Berkshire Hathaway (https://en.wikipedia.org/wiki/Berkshire_Hatha... 410,880 [15] (#cite_note-yb-16) 2017 Q1 5 United_States (https://en.wikipedia.org/wiki/United_States) None
In [76]:
# fetch missing values from Wikipedia
import os
import json
import wptools
from slugify import slugify

# get unique values
companies = list(final_first.append(final_second)['Company'].unique())
print "%s companies"%len(companies)

# Count files in folder
# print len(os.listdir('./companies'))

companies_info = {}

for company_name in companies:
    
    if type(company_name) is str  :
        
        json_file_name = "companies/%s.json"%slugify(company_name.decode('utf-8'))

        if not os.path.exists(json_file_name): 
            print "fetching %s..."%company_name
            # fetch infobox
            page = wptools.page(company_name).get_parse()
            infobox = page.data['infobox']

            #save infobox as json file

            print json_file_name
            with open(json_file_name, 'w') as outfile:
                json.dump(dict(infobox), outfile)
            print "saved to %s"%json_file_name
            print 
        else : 
            with open(json_file_name, 'r') as outfile:
                companies_info[company_name] = json.load(outfile)

print "Fetched info about %s companies."%len(companies_info.keys())
44 companies
Fetched info about 44 companies.
In [77]:
# sort by categories

# Plateformes numériques :  Conglomerate (Alphabet + Tencent)  Internet (alibaba), Facebook, Amazon, Apple (>juillet 2008), Microsoft (>juin 2009)
# Technologie de l’information : Cloud computing (IBM), Computer software, Computer hardware, Consumer electronics, Apple (<juillet 2008), Microsoft (<juin 2009)
# Banque & finance: Banking, Conglomerate (Berskshire), Financial services
# Energie : Energy industry, Oil and gas, 
# Médical & Pharma : Pharmaceutical industry, Medical equipment
# Télécom : Telecommunications
# Distribution : Retail, Online shopping, Fast-moving consumer goods  (1 seul : Procter & Gamble)
# Equipements et resources : Conglomerate (GE), Automotive industry, Food processing, Metals,


# parse industry for each result
industries = {}

french_names = {
    "Plateformes numériques" : ["Internet", "Cloud computing", ],
    "Technologie de l’information" : ["Computer software", "Computer hardware", "Consumer electronics"],
    "Banque & Finance" : ["Bank", "Banking", "Financial services"],
    "Energie" : ["Energy industry", "Oil and gas", "Petroleum industry", "List of petroleum companies", "Oil and gas industry"],
    "Médical & Pharma" : ["Pharmaceutical industry", "Medical equipment"],
    "Distribution" : ["Retail", "Online shopping", "Fast-moving consumer goods"],
    "Télécom" : ["Telecommunication", "Telecommunications"],
    "Equipements & Ressources" : ["Automotive industry", "Food processing", "Metals"]
}

translations = {}
for fr in french_names: 
    for name in french_names[fr] :
        translations[name] = fr 
# print translations
        
for company_name in companies_info :
    info = companies_info[company_name]    
    s = info[u'industry']
    industry = s[s.find("[[")+1:s.find("]]")][1:].split("|")[0]
        
    if industry == "Conglomerate  (company)" or industry == "Conglomerate (company)":
        if company_name == "Berkshire Hathaway":
            industries[company_name] = "Banque & Finance"
        elif company_name == "General Electric":
            industries[company_name] = "Equipements & Ressources"
        elif company_name == "Alphabet Inc.":
            industries[company_name] = "Plateformes numériques"
        elif company_name == "Tencent":
            industries[company_name] = "Plateformes numériques"         
    else :
        if company_name == "Amazon.com": 
            industries[company_name] = "Plateformes numériques"
        else :
            industries[company_name] = translations[industry]
#         print company_name

# print industries.keys()
print 'Industries parsed.'
# print industries
Industries parsed.
In [78]:
# merge both
final = final_first.append(final_second)
print final.shape

# remap all values using updated data
final["Primary industry"] = final["Company"]
final = final.replace({ "Primary industry" : industries })
# print final.replace({ "Primary industry" : industries })

# move Apple and Microsoft after 2016
final.ix[ (final['Company'] == "Apple Inc.") & (final['Year'] > 2008), "Primary industry"] = "Plateformes numériques" 
final.ix[(final['Company'] == "Microsoft") & (final['Year'] > 2008), "Primary industry"] = "Plateformes numériques"

# some final clean
final = final.drop(["Notes", "Headquarters Link"], axis=1)
final["Market value"] = final["Market value"].str.split('[', expand=True)[0] # remove
final["Company Link"] = final["Company Link"].str[1:-1]
# final["Primary industry"] = final["Primary industry"].str.split('  ', expand=True)[0] # remove
final.head()

# save as file
final.to_csv("Data - List of public corporations by market capitalization - FINAL.csv", index=False)
print 'results saved to "./final.csv"'

final.head()
(480, 11)
results saved to "./final.csv"
Out[78]:
Company Company Link Headquarters Id Market value Primary industry Quarter Rank Year
0 Petrochina https://en.wikipedia.org/wiki/Petrochina China 2007-Q4-1 723,952 Energie Q4 1 2007
1 Exxon Mobil https://en.wikipedia.org/wiki/Exxon_Mobil United States 2007-Q4-2 511,887 Energie Q4 2 2007
2 General Electric https://en.wikipedia.org/wiki/General_Electric United States 2007-Q4-3 374,637 Equipements & Ressources Q4 3 2007
3 China Mobile https://en.wikipedia.org/wiki/China_Mobile Hong Kong 2007-Q4-4 354,120 Télécom Q4 4 2007
4 Industrial and Commercial Bank of China https://en.wikipedia.org/wiki/Industrial_and_C... China 2007-Q4-5 338,989 Banque & Finance Q4 5 2007
In [79]:
# sort result by year/quarter/industry

results_by_quarter = {}

# get all results by quarter
for index, row in final.sort_values("Id").iterrows():
    
    quarter = "%s-%s"%(row["Year"], row["Quarter"])
    
    try :
        results_by_quarter[quarter].append((row["Primary industry"], row["Market value"]))
    except : 
        results_by_quarter[quarter] = [ (row["Primary industry"], row["Market value"]) ]

# sum up results by industry        
final_results = {}

for q in results_by_quarter:
    q_results = {} 
    for res in results_by_quarter[q]:
        if res[1] is not None: 
            industry = res[0]
            amount = float(''.join(res[1].split(",")))
            
            try : 
                q_results[industry] = q_results[industry]+amount
            except KeyError:
                q_results[industry] = amount
    
    final_results[q] = q_results
    # make percent
#     total = sum([q_results[qr] for qr in q_results])
#     final_results[q] = { qr : q_results[qr]/total*100 for qr in q_results}

df = pd.DataFrame(final_results)
df.to_csv("Data - Market capitalization by Industries.csv")
print 'results saved.'
df.head()
results saved.
Out[79]:
2006-Q1 2006-Q2 2006-Q3 2006-Q4 2007-Q1 2007-Q2 2007-Q3 2007-Q4 2008-Q1 2008-Q2 ... 2015-Q3 2015-Q4 2016-Q1 2016-Q2 2016-Q3 2016-Q4 2017-Q1 2017-Q2 2017-Q3 2017-Q4
Banque & Finance 450641.0 661220.0 698952.0 768041.0 705822.0 470666.0 734497.0 338989.0 277236.0 439191.1 ... 318180.0 850646.0 595775.0 345860.0 358300.0 989937.54 1003157.0 418880.0 451840.0 860540.0
Distribution 196860.0 200762.0 NaN NaN NaN NaN NaN NaN 215640.0 188752.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Energie 1012510.0 1075604.0 1085531.3 1162849.0 889496.0 1215912.0 1038008.0 2084619.0 1396375.0 1128235.9 ... 304245.0 325167.0 350991.0 383396.0 358519.0 374280.00 339897.0 341947.0 348248.0 NaN
Equipements & Ressources 559258.0 342731.0 364414.0 624725.0 594443.0 621840.0 424191.0 374637.0 369569.0 NaN ... 481430.0 313892.0 295546.0 280927.0 261876.0 279545.92 NaN NaN NaN NaN
Médical & Pharma NaN NaN 206785.0 NaN NaN NaN NaN NaN NaN 156515.9 ... 498010.0 287153.0 300604.0 328234.0 320836.0 313432.46 337947.0 357310.0 347497.0 375360.0

5 rows × 48 columns

In [ ]:
 
In [ ]: